{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark import SparkContext\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql import *\n",
    "from pyspark.sql.types import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc=SparkContext(appName=\"json_dataframe\") \n",
    "spark=SparkSession.builder.appName('json_dataframe').master(\"Yct201811021847\").getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_df_from_json():\n",
    "    '''\n",
    "    read的类型是DataFrameReader\n",
    "    '''\n",
    "    #schema = StructType([\n",
    "    #    StructField(\"name\", StringType(), True),    \n",
    "    #    StructField(\"times\", IntegerType(), True),\n",
    "    #    StructField(\"url\", StringType(), True)\n",
    "    #]) \n",
    "    df = spark.read.json('test.json')\n",
    "    return df.select(df.name,df.times.cast('int').alias('times'),df.url)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_df=create_df_from_json()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- name: string (nullable = true)\n",
      " |-- times: integer (nullable = true)\n",
      " |-- url: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "csv_df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(name=u'Google', times=3, url=u'http://www.google.com'),\n",
       " Row(name=u'Baidu', times=2, url=u'http://www.baidu.com'),\n",
       " Row(name=u'bing', times=1, url=u'http://www.bing.com'),\n",
       " Row(name=u'Google', times=4, url=u'http://www.google.com'),\n",
       " Row(name=u'Baidu', times=7, url=u'http://www.baidu.com'),\n",
       " Row(name=u'bing', times=9, url=u'http://www.bing.com'),\n",
       " Row(name=u'Google', times=2, url=u'http://www.google.com'),\n",
       " Row(name=u'Baidu', times=4, url=u'http://www.baidu.com'),\n",
       " Row(name=u'bing', times=3, url=u'http://www.bing.com'),\n",
       " Row(name=u'Google', times=9, url=u'http://www.google.com'),\n",
       " Row(name=u'Baidu', times=3, url=u'http://www.baidu.com'),\n",
       " Row(name=u'bing', times=6, url=u'http://www.bing.com'),\n",
       " Row(name=u'Google', times=123, url=u'http://www.google.com'),\n",
       " Row(name=u'Baidu', times=28, url=u'http://www.baidu.com'),\n",
       " Row(name=u'bing', times=101, url=u'http://www.bing.com'),\n",
       " Row(name=u'Google', times=23, url=u'http://www.google.com'),\n",
       " Row(name=u'Baidu', times=8, url=u'http://www.baidu.com'),\n",
       " Row(name=u'bing', times=11, url=u'http://www.bing.com')]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "csv_df.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pyspark.sql.dataframe.DataFrame"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(csv_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+------------------+\n",
      "|  name|        avg(times)|\n",
      "+------+------------------+\n",
      "|  null| 19.27777777777778|\n",
      "| Baidu| 8.666666666666666|\n",
      "|  bing|21.833333333333332|\n",
      "|Google|27.333333333333332|\n",
      "+------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "csv_df.cube('name').avg().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+----------+\n",
      "|  name|sum(times)|\n",
      "+------+----------+\n",
      "|  null|       347|\n",
      "| Baidu|        52|\n",
      "|  bing|       131|\n",
      "|Google|       164|\n",
      "+------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "csv_df.cube('name').sum().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+----------+\n",
      "|  name|max(times)|\n",
      "+------+----------+\n",
      "|  null|       123|\n",
      "| Baidu|        28|\n",
      "|  bing|       101|\n",
      "|Google|       123|\n",
      "+------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "csv_df.cube('name').max().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_df_from_rdd():\n",
    "    # 从集合中创建新的RDD\n",
    "    stringCSVRDD = spark.sparkContext.parallelize([\n",
    "                    (123, \"Katie\", 19, \"brown\"),\n",
    "                    (456, \"Michael\", 22, \"green\"),\n",
    "                    (789, \"Simone\", 23, \"blue\")])\n",
    "    # 设置dataFrame将要使用的数据模型，定义列名，类型和是否为能为空\n",
    "    schema = StructType([StructField(\"id\", LongType(), True),\n",
    "                        StructField(\"name\", StringType(), True),\n",
    "                        StructField(\"age\", LongType(), True),\n",
    "                        StructField(\"eyeColor\", StringType(), True)])\n",
    "    # 创建DataFrame\n",
    "    df = spark.createDataFrame(stringCSVRDD,schema)\n",
    "    return df\n",
    "swimmers= create_df_from_rdd()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- id: long (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- age: long (nullable = true)\n",
      " |-- eyeColor: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "swimmers.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(id=789, name=u'Simone', id=789, age=23, eyeColor=u'blue'),\n",
       " Row(id=456, name=u'Michael', id=456, age=22, eyeColor=u'green'),\n",
       " Row(id=123, name=u'Katie', id=123, age=19, eyeColor=u'brown')]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "swimmers.select(swimmers.id,swimmers.name,swimmers.id,swimmers.age,swimmers.eyeColor).orderBy(swimmers.age.desc()).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "df=spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], [\"name\", \"height\"])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
